{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "edab3cd3-e5a8-4b63-8b8d-c9ef758571b8",
   "metadata": {},
   "source": [
    "# Baseline model for batch monitoring example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b54264e6eb117908",
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "import datetime\n",
    "import pandas as pd\n",
    "\n",
    "from evidently import DataDefinition\n",
    "from evidently import Dataset\n",
    "from evidently import Report\n",
    "from evidently.metrics import ValueDrift, DriftedColumnsCount, MissingValueCount\n",
    "\n",
    "from joblib import load, dump\n",
    "from tqdm import tqdm\n",
    "\n",
    "from sklearn.linear_model import LinearRegression\n",
    "from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "46aeb5ac-bfb7-4fe4-b732-6bf372ba2e95",
   "metadata": {},
   "outputs": [],
   "source": [
    "# -p keeps this cell re-runnable: no error if the directory already exists\n",
    "! mkdir -p data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c935a6bc588248d7",
   "metadata": {},
   "outputs": [],
   "source": [
    "files = [('green_tripdata_2022-02.parquet', './data'), ('green_tripdata_2022-01.parquet', './data')]\n",
    "\n",
    "print(\"Download files:\")\n",
    "for file, path in files:\n",
    "    url = f\"https://d37ci6vzurychx.cloudfront.net/trip-data/{file}\"\n",
    "    save_path = f\"{path}/{file}\"\n",
    "    # stream the response so the whole file is never held in memory;\n",
    "    # the context manager releases the connection when the download ends\n",
    "    with requests.get(url, stream=True, timeout=60) as resp:\n",
    "        # fail on 4xx/5xx instead of writing an error body into the .parquet file\n",
    "        resp.raise_for_status()\n",
    "        # Content-Length can be missing; tqdm accepts total=None\n",
    "        content_length = resp.headers.get(\"Content-Length\")\n",
    "        with open(save_path, \"wb\") as handle, tqdm(\n",
    "            desc=f\"{file}\",\n",
    "            postfix=f\"save to {save_path}\",\n",
    "            total=int(content_length) if content_length is not None else None,\n",
    "            unit=\"B\",\n",
    "            unit_scale=True,\n",
    "        ) as progress:\n",
    "            # iter_content() defaults to 1-byte chunks; read 1 MiB at a time\n",
    "            # and advance the bar by the number of bytes actually written\n",
    "            for chunk in resp.iter_content(chunk_size=1024 * 1024):\n",
    "                handle.write(chunk)\n",
    "                progress.update(len(chunk))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "10f4d69997668ea8",
   "metadata": {},
   "outputs": [],
   "source": [
    "jan_data = pd.read_parquet('data/green_tripdata_2022-01.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "755c48c29d032e40",
   "metadata": {},
   "outputs": [],
   "source": [
    "jan_data.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "abc417723077a646",
   "metadata": {},
   "outputs": [],
   "source": [
    "jan_data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "281a27332b636f09",
   "metadata": {},
   "outputs": [],
   "source": [
    "# create target: trip duration in minutes (dropoff minus pickup)\n",
    "# .dt.total_seconds() is vectorized, unlike a per-row .apply over Timedelta objects\n",
    "jan_data[\"duration_min\"] = (\n",
    "    jan_data.lpep_dropoff_datetime - jan_data.lpep_pickup_datetime\n",
    ").dt.total_seconds() / 60"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "53880bc897446f97",
   "metadata": {},
   "outputs": [],
   "source": [
    "# filter out outliers\n",
    "# keep trips lasting 0..60 minutes (both bounds included)\n",
    "duration_ok = jan_data.duration_min.between(0, 60)\n",
    "# keep trips with more than 0 and at most 8 passengers\n",
    "passengers_ok = jan_data.passenger_count.between(0, 8, inclusive=\"right\")\n",
    "jan_data = jan_data[duration_ok & passengers_ok]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6081a7e81e9f56f",
   "metadata": {},
   "outputs": [],
   "source": [
    "jan_data.duration_min.hist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1c36e0ebfb0879ec",
   "metadata": {},
   "outputs": [],
   "source": [
    "# data labeling\n",
    "# target: the column the regression is fitted against\n",
    "target = \"duration_min\"\n",
    "# num_features + cat_features together form the model inputs;\n",
    "# the same two lists are passed to the Evidently DataDefinition\n",
    "num_features = [\"passenger_count\", \"trip_distance\", \"fare_amount\", \"total_amount\"]\n",
    "cat_features = [\"PULocationID\", \"DOLocationID\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "553a23ed50b7c326",
   "metadata": {},
   "outputs": [],
   "source": [
    "jan_data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4b2d012fb9b1894a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# first 30000 rows for training, the rest for validation\n",
    "# explicit copies: a 'prediction' column is added to both frames later, and\n",
    "# assigning into a slice of jan_data would raise SettingWithCopyWarning\n",
    "train_data = jan_data[:30000].copy()\n",
    "val_data = jan_data[30000:].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "acbd0bcaea363591",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = LinearRegression()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2071678344a3f82e",
   "metadata": {},
   "outputs": [],
   "source": [
    "model.fit(train_data[num_features + cat_features], train_data[target])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f2b35c8b807bbd0a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# predict on the training split and store the result as a new column\n",
    "train_features = train_data[num_features + cat_features]\n",
    "train_preds = model.predict(train_features)\n",
    "train_data['prediction'] = train_preds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "775faa558e6ed9f5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# predict on the validation split and store the result as a new column\n",
    "val_features = val_data[num_features + cat_features]\n",
    "val_preds = model.predict(val_features)\n",
    "val_data['prediction'] = val_preds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e675db84c123c9db",
   "metadata": {},
   "outputs": [],
   "source": [
    "# mean absolute error: training split first, then validation split\n",
    "for split in (train_data, val_data):\n",
    "    print(mean_absolute_error(split.duration_min, split.prediction))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "594f99e8-3667-4108-adf3-d9509150924d",
   "metadata": {},
   "source": [
    "# Dump model and reference data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b323d6bb-7cbc-4b0b-893c-88f9f1b54a31",
   "metadata": {},
   "outputs": [],
   "source": [
    "# -p keeps this cell re-runnable: no error if the directory already exists\n",
    "! mkdir -p models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a41101a02592a969",
   "metadata": {},
   "outputs": [],
   "source": [
    "# serialize the fitted model with joblib\n",
    "with open('models/lin_reg.bin', mode='wb') as model_file:\n",
    "    dump(model, model_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6ba1f803d2d9ddf7",
   "metadata": {},
   "outputs": [],
   "source": [
    "val_data.to_parquet('data/reference.parquet')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c857c5683c59b65",
   "metadata": {},
   "source": [
    "# Evidently Report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1e10db1a0250c1ab",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 'prediction' is declared as a numerical column alongside the input features\n",
    "data_definition = DataDefinition(\n",
    "    numerical_columns=num_features + ['prediction'],\n",
    "    categorical_columns=cat_features,\n",
    ")\n",
    "\n",
    "# both splits share the same column definition\n",
    "train_dataset = Dataset.from_pandas(train_data, data_definition)\n",
    "val_dataset = Dataset.from_pandas(val_data, data_definition)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "543ca2a7e4f00a45",
   "metadata": {},
   "outputs": [],
   "source": [
    "# the order matters: later cells read the results by position (0, 1, 2)\n",
    "monitoring_metrics = [\n",
    "    ValueDrift(column='prediction'),\n",
    "    DriftedColumnsCount(),\n",
    "    MissingValueCount(column='prediction'),\n",
    "]\n",
    "report = Report(metrics=monitoring_metrics)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "676afb69b6172332",
   "metadata": {},
   "outputs": [],
   "source": [
    "snapshot = report.run(reference_data=train_dataset, current_data=val_dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "69ad038a3a95098a",
   "metadata": {},
   "outputs": [],
   "source": [
    "snapshot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a9e2aa5ca9b496d4",
   "metadata": {},
   "outputs": [],
   "source": [
    "result = snapshot.dict()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6e5035dd2c278a9a",
   "metadata": {},
   "outputs": [],
   "source": [
    "result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e899f3602bfff467",
   "metadata": {},
   "outputs": [],
   "source": [
    "#prediction drift\n",
    "result['metrics'][0]['value']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1259822acaefed1e",
   "metadata": {},
   "outputs": [],
   "source": [
    "#number of drifted columns\n",
    "result['metrics'][1]['value']['count']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a3addf64d2dc04ae",
   "metadata": {},
   "outputs": [],
   "source": [
    "#number of missing values in 'prediction' (the 'count' field, not the share)\n",
    "result['metrics'][2]['value']['count']"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b2f9038a218615f1",
   "metadata": {},
   "source": [
    "# Evidently Dashboard"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fb2023542b25fd91",
   "metadata": {},
   "outputs": [],
   "source": [
    "from evidently.presets import DataDriftPreset, DataSummaryPreset\n",
    "\n",
    "from evidently.ui.workspace import Workspace\n",
    "# explicit names instead of a wildcard import, so it is clear where the\n",
    "# panel helpers used in the dashboard cell come from\n",
    "from evidently.sdk.panels import PanelMetric, bar_plot_panel, line_plot_panel, text_panel\n",
    "from evidently.legacy.renderers.html_widgets import WidgetSize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ad1e20608545b2ec",
   "metadata": {},
   "outputs": [],
   "source": [
    "ws = Workspace(\"workspace\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e953dfbe75da3c06",
   "metadata": {},
   "outputs": [],
   "source": [
    "# create a project in the workspace, set its description and save it\n",
    "# NOTE(review): presumably each run of this cell creates another project with the same name - confirm before re-running\n",
    "project = ws.create_project(\"NYC Taxi Data Quality Project\")\n",
    "project.description = \"My project description\"\n",
    "project.save()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b50a04f69b18b46e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# data summary for validation trips picked up on 2022-01-28\n",
    "regular_report = Report(metrics=[DataSummaryPreset()])\n",
    "\n",
    "# left-inclusive window: from 2022-01-28 up to, but not including, 2022-01-29\n",
    "pickup_in_window = val_data.lpep_pickup_datetime.between('2022-01-28', '2022-01-29', inclusive=\"left\")\n",
    "data = Dataset.from_pandas(val_data.loc[pickup_in_window], data_definition=data_definition)\n",
    "\n",
    "regular_snapshot = regular_report.run(\n",
    "    current_data=data,\n",
    "    timestamp=datetime.datetime(2022, 1, 28),\n",
    ")\n",
    "\n",
    "regular_snapshot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "11d1f0f5a3a478a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "ws.add_run(project.id, regular_snapshot)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "15c502ed-a2d6-4de9-866a-5eca46c354ce",
   "metadata": {},
   "source": [
    "**Note:** to view the report, run the `evidently ui` command in a separate terminal tab."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c9ffc785f07a3b43",
   "metadata": {},
   "outputs": [],
   "source": [
    "#configure the dashboard: a title panel followed by two size=\"half\" plots\n",
    "dashboard_panels = [\n",
    "    text_panel(title=\"NYC taxi data dashboard\"),\n",
    "    bar_plot_panel(\n",
    "        title=\"Inference Count\",\n",
    "        values=[PanelMetric(metric=\"RowCount\", legend=\"count\")],\n",
    "        size=\"half\",\n",
    "    ),\n",
    "    line_plot_panel(\n",
    "        title=\"Number of Missing Values\",\n",
    "        values=[PanelMetric(metric=\"DatasetMissingValueCount\", legend=\"count\")],\n",
    "        size=\"half\",\n",
    "    ),\n",
    "]\n",
    "\n",
    "# panels are added in list order\n",
    "for panel in dashboard_panels:\n",
    "    project.dashboard.add_panel(panel)\n",
    "\n",
    "project.save()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e57e1bcc-4561-4e23-829b-3a635c989b2b",
   "metadata": {},
   "source": [
    "To view the dashboard, run the `evidently ui` command in a separate terminal tab."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ef86fd9791faf042",
   "metadata": {},
   "outputs": [],
   "source": [
    "# data summary for validation trips picked up on 2022-01-29\n",
    "regular_report = Report(metrics=[DataSummaryPreset()])\n",
    "\n",
    "# left-inclusive window: from 2022-01-29 up to, but not including, 2022-01-30\n",
    "pickup_in_window = val_data.lpep_pickup_datetime.between('2022-01-29', '2022-01-30', inclusive=\"left\")\n",
    "data = Dataset.from_pandas(val_data.loc[pickup_in_window], data_definition=data_definition)\n",
    "\n",
    "regular_run = regular_report.run(\n",
    "    current_data=data,\n",
    "    timestamp=datetime.datetime(2022, 1, 29),\n",
    ")\n",
    "\n",
    "regular_run"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "42a513dc53964471",
   "metadata": {},
   "outputs": [],
   "source": [
    "ws.add_run(project.id, regular_run)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7da8c7d2911c5f1e",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
